Load and transform the data for every subject

First, let’s load the Power 2011 region database. This will be used as an “atlas” throughout, to guide the development of the regions.

# Load the Power et al. (2011) ROI atlas: one row per region, with MNI
# coordinates (X, Y, Z), a numeric network id, a plotting color, and the
# network name. Only the columns listed below are kept; the CSV's trailing
# unnamed columns trigger the "Missing column names" warning echoed below.
power2011 <- read_csv("../bin/power_2011.csv", 
                      col_types = cols(ROI=col_double(),
                                       X = col_double(),
                                       Y = col_double(),
                                       Z = col_double(),
                                       Network = col_double(),
                                       Color = col_character(),
                                       NetworkName = col_character())) %>%
  dplyr::select(ROI, X, Y, Z, Network, Color, NetworkName)
## Warning: Missing column names filled in: 'X8' [8], 'X9' [9], 'X10' [10],
## 'X11' [11], 'X12' [12], 'X13' [13], 'X14' [14], 'X15' [15], 'X16' [16],
## 'X17' [17]

Create the Group-Level Regressor Matrix \(X\)

We now need to load the group level data. In essence, this corresponds to creating a matrix X in which every individual is a row and every column is a different ROI-to-ROI connection.

# NOFLY will collect subjects whose connectivity matrix contains NAs;
# SUBJS collects the bare HCP id of every subject that is loaded.
NOFLY <- c()
SUBJS <- c()
# Every ordered ROI-ROI pair, labeled "i-j", flattened column-major.
# These label the columns of the subject-by-connection matrix C built below.
cols <- outer(power2011$ROI, power2011$ROI, function(x, y) {paste(x, y, sep="-")})
cols %<>% as.vector

# Canonical label for an ROI-to-ROI connection: the two ids joined by "-",
# smaller id first, so that (a, b) and (b, a) map to the same string.
connection <- function(x, y) {
  lo <- min(x, y)
  hi <- max(x, y)
  paste(lo, hi, sep = "-")
}

# Elementwise version, usable with outer().
vconnection <- Vectorize(connection)

# Statistical mode: the most frequent value of x, with ties broken by first
# occurrence (the tabulate/match idiom).
#
# Args:
#   x: a vector (numeric, character, or factor).
#   na.rm: drop NA values before computing the mode?
# Returns: the modal value of x.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# One row per network: the numeric network id plus its modal name
# (Mode() guards against any inconsistent labels within a network).
reduced_power2011 <- power2011 %>% 
  dplyr::select(Network, NetworkName) %>%
  group_by(Network) %>%
  summarize(Network = mean(Network), NetworkName = Mode(NetworkName))

# Human-readable label for a network-to-network connection, e.g.
# "Default mode-Visual", with the smaller network id first so the pair is
# canonical. Names are looked up in the global reduced_power2011 table
# (columns Network, NetworkName).
# Fix: the original paste()-ed two filtered 1x1 tibbles, relying on
# paste()'s implicit deparsing of a data frame; direct column indexing
# (as in connection_name2 below) returns the plain character value.
connection_name <- function(x, y) {
  first <- min(x, y)
  second <- max(x, y)
  paste(reduced_power2011$NetworkName[reduced_power2011$Network == first],
        reduced_power2011$NetworkName[reduced_power2011$Network == second],
        sep="-")
}

# Elementwise version, usable with outer().
vconnection_name <- Vectorize(connection_name)

# Same contract as connection_name: canonical "NameA-NameB" label for a
# pair of network ids, with names pulled straight out of the global
# reduced_power2011 table via logical indexing.
connection_name2 <- function(x, y) {
  lo <- min(x, y)
  hi <- max(x, y)
  name_of <- function(id) {
    reduced_power2011$NetworkName[reduced_power2011$Network == id]
  }
  paste(name_of(lo), name_of(hi), sep = "-")
}

# Elementwise version, usable with outer().
vconnection_name2 <- Vectorize(connection_name2)


# Network-id pair ("1-5") and network-name pair for every ROI x ROI cell,
# flattened column-major so they align element-for-element with `cols`.
nets <- outer(power2011$Network, power2011$Network, vconnection)
nets %<>% as.vector
netnames <- outer(power2011$Network, power2011$Network, vconnection_name2)
netnames %<>% as.vector

# Connectivity matrix C: one row per subject directory, one column per
# ROI-ROI pair (same order as `cols`).
n <- length(grep("sub-*", dir("../rsfmri/REST1")))
C <- matrix(data = rep(0, length(cols)*n), nrow =  n)

j <- 1  # row index into C

R <- NULL
PR <- NULL

# Load each subject's partial-correlation matrix and store it, flattened
# column-wise, as one row of C. Subjects with any NA value are flagged.
for (sub in dir("../rsfmri/REST1")[grep("sub-*", dir("../rsfmri/REST1"))]) {
  SUBJS %<>% c(strsplit(sub, "-")[[1]][2])  # keep the bare HCP id
  M <- paste("../rsfmri/REST1", 
             sub, 
             "ses-01/mr_pcorr.txt", sep="/") %>%
    read_csv(skip = 1,
             col_names = F,
             col_types = cols(
               .default = col_double(),
               X1 = col_character()  # first column holds row labels
             )) %>%
    as.matrix() 
  v <- as_vector(M[,2:265])  # v spreads M column-wise. M is symmetrical, so it should not matter, but better not risk it
  C[j,] <- v
  if (length(v[is.na(v)]) > 0) {
    print(paste("NA detected in sub", sub))
    NOFLY %<>% c(sub)  # Adds sub to the NOFLY list
  }
  
  j <- j + 1
}

Define the Networks

If we want, we can restrict the analysis only to a limited set of networks (and their cross-network connections) by modifying the NOI (Networks of Interest) variable. The variable will be used to create a second list, COI (Connections of interest), which will contain the possible list of network-to-network connections for the networks in NOI. (This is currently not needed, since the \(X\) matrix is already restricted to meaningful connections).

# Networks of Interest: the Power (2011) network labels retained in the
# analysis (currently all of them).
NOI <- c(
  "Uncertain",
  "Sensory/somatomotor Hand",
  "Sensory/somatomotor Mouth",
  "Cingulo-opercular Task Control",
  "Auditory",
  "Default mode",
  "Memory retrieval?",
  "Ventral attention",
  "Visual",
  "Fronto-parietal Task Control",
  "Salience",
  "Subcortical",
  "Cerebellar",
  "Dorsal attention"
)

# Connections of Interest: every ordered "From-To" pairing of the networks
# in NOI, flattened column-major.
COI <- as.vector(outer(NOI, NOI, paste, sep = "-"))

Now, we need to remove some columns from the hyper-large X matrix, and define proper groupings for Lasso.

First, we ensure that the data in the connectivity matrix C is actually numeric.

# Coerce every column of C to numeric (guards against character carry-over
# from the CSV import above).
C <- apply(C, 2, FUN=as.numeric)

Then, we create a set of “censor” vectors, each of which is a binary vector with the same length as the number of columns of C. If the j-th element of the censor vector is TRUE, the corresponding column in C is kept, otherwise it is removed from the possible regressors.

The first censor vector simply removes the redundant columns (since the connectivity from A to B is the same as the connectivity of B to A) and the self-correlations:

# Keep only the strict upper triangle (ROI x < ROI y): drops the diagonal
# (self-connections) and one of each redundant symmetric pair.
censor <- outer(power2011$ROI, 
                power2011$ROI, 
                function(x, y) {x < y}) %>% as.vector()

The second censor vector removes unlikely functional connections: those with a mean partial correlation value \(|r| < 0.05\).

# Keep only connections whose mean partial correlation across subjects
# exceeds 0.05 in absolute value.
censor2 <- colMeans(C) %>% abs() > 0.05

Now, we combine the censor vectors in a tibble that contains all of the relevant information about each column in C.

# Bookkeeping table: one row per column of C, carrying its original column
# index, network pair, network-name pair, ROI pair, and both censor flags.
# (Note: this `order` shadows base::order for the rest of the script.)
order <- tibble(index = 1:length(nets), 
                network = nets, 
                network_names = netnames,
                connection = cols, 
                censor=censor,
                censor2 = censor2)
order %<>% arrange(network)

And we remove all entries for which a censor vector is FALSE (we also create a grouping factor G, in case in the future we want to use Group Lasso)

# Column indices of C that survive both censors and belong to the
# networks of interest; these become the columns of X.
I <- order %>%
  filter(censor == TRUE) %>%
  filter(censor2 == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(index) 

# NOTE(review): G is filtered by censor but NOT censor2, so it is longer
# than I and does not align one-to-one with the columns of X — confirm
# before using it as a Group-Lasso grouping factor.
G <- order %>%
  filter(censor == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(network) 
# G is the real grouping factor for Lasso!

As a last step, we create the “real” regressor matrix \(X\), which is the proper subset of \(C\) after removing all of the censored columns.

# Regressor matrix: subjects (rows) x surviving connections (columns).
X <- C[,as_vector(I)]

Load the Dependent Variable \(Y\)

Now we need to load the dependent variable. In this case, it is a binary variable that determines which strategy model best fits the behavioral data of an individual, whether it is the “memory” strategy (\(Y = 1\)) or the “procedural” strategy (\(Y = 2\)).

# Best-fitting strategy model per subject (keyed by HCPID), sorted by id so
# it can be aligned with the rsfMRI subjects below.
dvs <- read_csv("../actr-models/model_output/MODELLogLikelihood.csv",
                col_types = cols(
                  .default = col_double(),
                  HCPID = col_character(),
                  best_model = col_character()
                )) %>% 
  dplyr::select(HCPID, best_model) %>%
  arrange(HCPID)
## Warning: Missing column names filled in: 'X1' [1]

Now we select only the rows of \(X\) and the values of \(Y\) for which we have both rsfMRI and model data:

# Keep only subjects present in both datasets. NOTE(review): this assumes
# the retained rows of X (SUBJS, from the sorted dir() listing) and of dvs
# (arranged by HCPID) end up in the same relative order for the common
# subjects — true as long as both orderings sort the same id strings the
# same way; confirm.
subjs_hcp <- paste(SUBJS, "fnca", sep="_")
common <- intersect(subjs_hcp, dvs$HCPID)
keep_X <- subjs_hcp %in% common
keep_Y <- dvs$HCPID %in% common
Y <- dvs$best_model[keep_Y]
X <- X[keep_X, ]

Finally, we transform the dependent variable \(Y\) into a binary numeric variable with values \((0, 1)\), so that we can use logistic regression.

# Recode the strategy label to 0/1 (first factor level, alphabetically,
# becomes 0).
Y <- as.numeric(as.factor(Y)) - 1

Quality and Characteristics of \(X\) and \(Y\)

Let’s do some visualization and analysis of our independent and dependent variables, just to ensure there are no obvious problems.

Collinearity of Connectivity Regressors \(X\)

The regressors \(X\) is certainly multi-collinear; that is a consequence of having a large number of predictors \(p > n\), which, in turn, is one of the reasons why we are using Lasso. Too much collinearity, however, could be really bad and push Lasso towards selecting non-optimal regressors. To gather a sense of how much collinearity we have, we can plot the distribution of correlations among regressors:

# Pairwise correlations between regressor columns; keep each pair once
# (strict lower triangle, no diagonal).
corX <- cor(X)
distCor <- as_vector(corX[lower.tri(corX, diag = F)])
distTibble <- as_tibble(data.frame(R=distCor))

# Histogram of between-regressor correlations.
ggplot(distTibble, aes(x=R)) +
  geom_histogram(col="white", alpha=0.5, binwidth = 0.05) +
  theme_pander() +
  ylab("Number of Correlations") +
  xlab("Correlation Value") +
  ggtitle("Distribution of Correlation Values Between Regressors")

All in all, the collinearity is not that bad—all regressors are correlated at \(|r| < 0.25\), and most of them are correlated at \(|r| < 0.1\), with a peak at \(r = 0\).

Distribution of Classes

And now, let’s visualize the histogram of the dependent variable we are trying to predict:

# Class balance of the dependent variable (0 = memory, 1 = procedural).
dependent <- as_tibble(data.frame(strategy=Y))

ggplot(dependent, aes(x = strategy, fill=as.factor(strategy))) +
  geom_histogram(bins=8, col="white", alpha=0.5) +
  scale_fill_aaas() +
  xlab("Strategy") +
  ylab("Number of Participants") +
  ggtitle("Distribution of Strategy Use") +
  theme_pander() +
  theme(legend.position = "none")

Because the classes are not equally distributed, and participants are more likely to use the memory strategy (\(Y=0\)) than the procedural one (\(Y = 1\)), we would need to adjust the weights of our Lasso model.

Machine-Learning with Lasso

To analyze the data, we will use Lasso, a statistical learning method based on penalized regression.

Weights

Most of the entries in our \(Y\) vector are coded as “0” (i.e., most participants use the memory strategy), which creates an imbalance. We are going to create an appropriate set of weights so that the two classes are balanced.

# Class weights that rebalance the binomial loss: each observation is
# weighted by the prevalence of the *other* class (Y is 0/1, so mean(Y) is
# the proportion of 1s), making both classes contribute equally overall.
W <- ifelse(Y == 0, mean(Y), 1 - mean(Y))

Defining the model

We can now define the lasso model. We will use the elastic net approach with \(\alpha = 1\) to generate a pure lasso model. The model will use a binomial (i.e., logistic) regression and will measure the cross-validation error as class misassignment.

# Full lasso path: alpha = 1 selects the pure-lasso end of the elastic-net
# family; class weights W rebalance the binomial loss.
# NOTE(review): type.measure is a cv.glmnet() argument, not a glmnet()
# one — confirm it is simply ignored here.
fit <- glmnet(y = Y,
              x = X,
              alpha=1,
              weights = W,
              family = "binomial",
              type.measure = "class",
              standardize = T
)

To choose the optimal value of \(\lambda\) in Lasso, we will examine the cross-validation error during a LOO cross-validation.

# Leave-one-out cross-validation over the lambda path (nfolds = N);
# grouped = F is required for per-fold error estimates with 1-subject folds.
fit.cv <- cv.glmnet(y = Y,
                    x = X,
                    alpha=1,
                    family = "binomial",
                    weights = W,
                    type.measure = "class",
                    standardize=T,
                    nfolds=length(Y),
                    grouped = F
)

Now, let’s look at the cross-validation error profile.

# LOO cross-validation error as a function of lambda.
plot(fit.cv)

The profile has the characteristic U-shape or increasing curve, with more error as \(\lambda\) increases. As recommended by Tibshirani, we will select the “lambda 1SE” value, which is the largest \(\lambda\) value whose cross-validation error does not differ by more than 1 SE from that of the \(\lambda\) value that gives us the minimum cross-validation error. This guarantees the maximum generalizability.

We can also visualize the profile of the beta weights of each connection for different values of \(\lambda\).

# Coefficient paths over the lambda sequence; the dashed vertical line marks
# the L1 norm of the coefficients at lambda.1se.
plot(fit, sub="Beta Values for Connectivity")

L1norm <- sum(abs(fit$beta[,which(fit$lambda==fit.cv$lambda.1se)]))
abline(v=L1norm, lwd=2, lty=2)

And now, plot prettier version

# Prettier ggplot version of the CV error curve: +/- 1 SD ribbon and a
# vertical line at the error-minimizing lambda.
lasso_df <- as_tibble(data.frame(lambda=fit.cv$lambda, 
                                 error=fit.cv$cvm, 
                                 sd=fit.cv$cvsd))

ggplot(lasso_df, aes(x=lambda, y=error)) +
  geom_line(aes(col=error), lwd=2) +
  scale_color_viridis("Error", option = "plasma") +
  geom_ribbon(aes(ymin=error -sd, ymax=error + sd), alpha=0.2,fill="blue") +
  xlab(expression(lambda)) +
  ylab("Cross-Validation Error") +
  ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
  geom_vline(xintercept = lasso_df$lambda[lasso_df$error==min(lasso_df$error)]) +
  theme_pander() +
  theme(legend.position="right")

Examining the Predictive Connectome

Let’s have a better look at the relevant connections that survive the Lasso penalty at the value of \(\lambda_{min} + 1 SE\). We start by saving these connections in a tibble, and saving the data on a file for later use.

# Coefficients at lambda.1se; keep only the non-zero ones, joined (by
# `index`) with the connection metadata, then saved to disk for later use.
betas <- fit$beta[, which(fit$lambda==fit.cv$lambda.1se)]
conn_betas <- as_tibble(data.frame(index=I$index, Beta=betas))
connectome <- order %>%
  filter(index %in% I$index) %>%
  inner_join(conn_betas) %>%
  dplyr::select(-censor2) %>%
  filter(Beta != 0) %>%
  arrange(index)
## Joining, by = "index"
write_csv(connectome, file="strategy_mr.csv")
save(fit, fit.cv, X, Y, order, I, G, file="strategy_mr.RData")

Finally, we can visualize the table of connections

# Render the surviving-connection table as styled HTML.
# NOTE(review): xtable() before kable() is unusual — kable() alone would
# normally suffice; confirm the intermediate step is intentional.
connectome %>%
  xtable() %>%
  kable(digits = 5) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
index network network_names connection censor Beta
8206 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 22-32 TRUE -1.89086
8207 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 23-32 TRUE 2.30505
9264 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 24-36 TRUE -2.03310
10063 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 31-39 TRUE 2.81201
16959 4-4 Auditory-Auditory 63-65 TRUE 0.53776
18017 4-4 Auditory-Auditory 65-69 TRUE 4.03346
18020 4-4 Auditory-Auditory 68-69 TRUE -0.26776
18261 2-4 Sensory/somatomotor Mouth-Auditory 45-70 TRUE 2.48728
18274 3-4 Cingulo-opercular Task Control-Auditory 58-70 TRUE 0.04146
18282 4-4 Auditory-Auditory 66-70 TRUE -0.11546
21995 -1-5 Uncertain-Default mode 83-84 TRUE 0.68047
24368 5-5 Default mode-Default mode 80-93 TRUE 1.69981
24906 5-5 Default mode-Default mode 90-95 TRUE -3.74493
24907 5-5 Default mode-Default mode 91-95 TRUE -5.51874
26222 5-5 Default mode-Default mode 86-100 TRUE -0.04432
28059 5-5 Default mode-Default mode 75-107 TRUE -1.74162
28323 5-5 Default mode-Default mode 75-108 TRUE -2.93711
31420 -1-5 Uncertain-Default mode 4-120 TRUE 0.37132
31534 5-5 Default mode-Default mode 118-120 TRUE 0.52838
32816 5-5 Default mode-Default mode 80-125 TRUE 4.46703
33124 5-5 Default mode-Default mode 124-126 TRUE -0.50586
34942 5-6 Default mode-Memory retrieval? 94-133 TRUE 1.66036
35205 5-6 Default mode-Memory retrieval? 93-134 TRUE -2.41717
35469 5-6 Default mode-Memory retrieval? 93-135 TRUE -1.37214
36007 5-5 Default mode-Default mode 103-137 TRUE -1.63355
37613 5-7 Default mode-Visual 125-143 TRUE 5.99955
40807 7-7 Visual-Visual 151-155 TRUE -2.08989
41075 7-7 Visual-Visual 155-156 TRUE -3.04218
41328 7-7 Visual-Visual 144-157 TRUE 0.52580
41864 7-7 Visual-Visual 152-159 TRUE 0.89705
42660 7-7 Visual-Visual 156-162 TRUE 1.58602
42927 7-7 Visual-Visual 159-163 TRUE -11.65413
43186 7-7 Visual-Visual 154-164 TRUE 2.06833
43458 7-7 Visual-Visual 162-165 TRUE 1.08723
44501 7-7 Visual-Visual 149-169 TRUE -0.83859
44514 7-7 Visual-Visual 162-169 TRUE -3.29812
45566 7-7 Visual-Visual 158-173 TRUE 1.09878
45810 8-11 Fronto-parietal Task Control-Ventral attention 138-174 TRUE 2.70591
47700 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 180-181 TRUE -1.06966
48220 -1-7 Uncertain-Visual 172-183 TRUE 0.97763
48759 -1–1 Uncertain-Uncertain 183-185 TRUE 0.31034
49015 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 175-186 TRUE -1.74089
50082 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 186-190 TRUE -4.20237
51142 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 190-194 TRUE -1.14423
52464 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 192-199 TRUE -2.95918
53356 1-9 Sensory/somatomotor Hand-Salience 28-203 TRUE 1.67191
53422 5-9 Default mode-Salience 94-203 TRUE 2.93233
54042 8-9 Fronto-parietal Task Control-Salience 186-205 TRUE -4.51978
54221 5-9 Default mode-Salience 101-206 TRUE -1.41648
55120 9-9 Salience-Salience 208-209 TRUE 6.91535
55500 3-9 Cingulo-opercular Task Control-Salience 60-211 TRUE 3.23332
55649 9-9 Salience-Salience 209-211 TRUE -1.12886
56708 9-9 Salience-Salience 212-215 TRUE -0.09215
57240 9-9 Salience-Salience 216-217 TRUE -0.91650
57770 9-9 Salience-Salience 218-219 TRUE -0.45811
61041 3-10 Cingulo-opercular Task Control-Subcortical 57-232 TRUE -0.28020
61745 10-10 Subcortical-Subcortical 233-234 TRUE -0.15582
62630 4-11 Auditory-Ventral attention 62-238 TRUE -2.21331
63756 -1-11 Uncertain-Ventral attention 132-242 TRUE -0.75419
64071 -1-13 Uncertain-Cerebellar 183-243 TRUE -4.11193
66780 -1-12 Uncertain-Dorsal attention 252-253 TRUE 3.47463
67242 1-8 Sensory/somatomotor Hand-Fronto-parietal Task Control 186-255 TRUE -0.60621
67260 1-9 Sensory/somatomotor Hand-Salience 204-255 TRUE -0.26895
67836 12-12 Dorsal attention-Dorsal attention 252-257 TRUE -0.14686
67838 -1-12 Uncertain-Dorsal attention 254-257 TRUE 0.00521
67873 1-12 Sensory/somatomotor Hand-Dorsal attention 25-258 TRUE 1.37378
68814 8-12 Fronto-parietal Task Control-Dorsal attention 174-261 TRUE -2.42184
69218 3-12 Cingulo-opercular Task Control-Dorsal attention 50-263 TRUE -2.01954
69461 1-12 Sensory/somatomotor Hand-Dorsal attention 29-264 TRUE 0.33782

And we can do some statistics. For example, which networks do these regions and connections belong to? Are they different from what would be expected from a random sample of connections from the connectome?

# Split an "i-j" ROI-pair label at the first dash and return the numeric id
# before / after it. NB: assumes non-negative ids (a leading "-" would
# produce an empty first field).
region_from <- function(s) as.numeric(strsplit(s, "-")[[1]][1])
region_to <- function(s) as.numeric(strsplit(s, "-")[[1]][2])

# Elementwise versions, usable on whole columns.
vregion_from <- Vectorize(region_from)
vregion_to <- Vectorize(region_to)

# Unique ROIs participating in any selected connection.
lROIs <- unique(c(vregion_from(connectome$connection),
                  vregion_to(connectome$connection)))

# Proportion of selected ROIs falling in each network.
rois <- power2011[lROIs,]
roi_stats <- rois %>%
  group_by(NetworkName, Color) %>%
  summarise(N=length(Color)/length(lROIs)) %>%
  add_column(Source="Lasso")
## `summarise()` has grouped output by 'NetworkName'. You can override using the `.groups` argument.
# Proportion of atlas ROIs per network, restricted to the networks of interest.
subsetPower <- filter(power2011,
                      NetworkName %in% NOI)

noi_stats <- subsetPower %>%
  group_by(NetworkName, Color) %>%
  summarise(N=length(Color)/dim(subsetPower)[1]) %>%
  add_column(Source="Power")
## `summarise()` has grouped output by 'NetworkName'. You can override using the `.groups` argument.
total_stats <- rbind(roi_stats, noi_stats)

# Side-by-side pie charts: network distribution of the lasso-selected ROIs
# vs. the full Power atlas.
ggplot(total_stats, aes(x="", y=N, fill=NetworkName)) +
  geom_bar(stat = "identity", col="white", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  #scale_fill_viridis(discrete = T) +
  scale_fill_ucscgb() +
  ylab("") +
  xlab("") +
  ggtitle("Distriution of Regions") +
  geom_text_repel(aes(label=percent(N, .01)), col="black",
            position=position_stack(vjust=1), size=3)+
  theme_pander() +
  guides(fill=guide_legend(ncol=2)) +
  theme(legend.position = "bottom")

There is no difference in the distribution of ROIs:

# Goodness-of-fit test: observed ROI counts per network vs. the atlas
# proportions. (Small expected counts trigger the warning echoed below.)
chisq.test(roi_stats$N*length(lROIs), p = noi_stats$N)
## Warning in chisq.test(roi_stats$N * length(lROIs), p = noi_stats$N): Chi-squared
## approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  roi_stats$N * length(lROIs)
## X-squared = 13.267, df = 13, p-value = 0.4274

And now, let’s look at the sparsity

# Split a "From-To" network-name label at the *first* dash.
# NOTE(review): network names can themselves contain dashes (e.g.
# "Fronto-parietal Task Control"), so these prefixes/suffixes are only
# cosmetic and must not be used to compare full network names.
net_from <- function(s) {as.character(strsplit(s, "-")[[1]][1])}
net_to <- function(s) {as.character(strsplit(s, "-")[[1]][2])}

vnet_from <- Vectorize(net_from)
vnet_to <- Vectorize(net_to)

# Classify a connection label as within-network ("Within") or
# between-network ("Between"). A label is within-network iff it has the
# form "<name>-<name>", so we compare the two string halves around the
# central dash. (Fix: the original compared net_from(s) to net_to(s),
# which split at the FIRST dash and misclassified within-network
# connections of dash-containing names, e.g. "Fronto-parietal Task
# Control-Fronto-parietal Task Control", as "Between".)
connectivity <- function(s) {
  len <- nchar(s)
  half <- (len - 1) %/% 2
  if (len %% 2 == 1 && substr(s, 1, half) == substr(s, half + 2, len)) {
    "Within"
  } else {
    "Between"
  }
}

vconnectivity <- Vectorize(connectivity)

# Reference set: all candidate connections (censored, networks of interest)
# tagged by Within/Between connection type.
# NOTE(review): unlike the lasso connectome, coi is not filtered by
# censor2 — confirm this is the intended reference distribution.
coi <- order %>%
  filter(censor == TRUE) %>%
  filter(network_names %in% COI) 

coi$from <- vnet_from(coi$network_names)
coi$to <- vnet_to(coi$network_names)
coi$connection_type <- vconnectivity(coi$network_names)

# Count and proportion of Within vs. Between connections in the reference set.
coi_stats <- coi %>% 
  group_by(connection_type) %>% 
  summarise(N=length(index), P=length(index)/dim(coi)[1]) %>%
  add_column(Source = "Power et al. (2011)")
  

# Same statistics for the lasso-selected connections.
connectome$connection_type <- vconnectivity(connectome$network_names)
connectome_stats <- connectome %>%
  group_by(connection_type) %>% 
  summarise(N=length(index), P=length(index)/dim(connectome)[1]) %>%
  add_column(Source = "Lasso Analysis")
  

total_stats2 <- rbind(connectome_stats, coi_stats)

# Pie charts of Within/Between proportions: lasso-selected connections vs.
# the reference distribution of candidate connections.
ggplot(total_stats2, aes(x="", y=P, fill=connection_type)) +
  geom_bar(stat = "identity", col="grey", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_jama() +
  scale_color_jama() +
  ylab("") +
  xlab("") +
  ggtitle("Distribuction of Connectivity\n(Within/Between Networks)") +
  geom_text_repel(aes(label=percent(P, .1)), col="white",
            position=position_stack(vjust=1), size=3)+
  theme_pander() +
  theme(legend.position = "bottom")

The difference between the two distributions is significant.

# Goodness-of-fit: observed Within/Between counts vs. reference proportions.
chisq.test(connectome_stats$N, p=coi_stats$P)
## 
##  Chi-squared test for given probabilities
## 
## data:  connectome_stats$N
## X-squared = 152.61, df = 1, p-value < 2.2e-16

Connectivity by Region

Finally, we can look at how many connections each region belongs to:

# Count how many selected connections each ROI participates in (each
# connection contributes both of its endpoints).
rois <- c(vregion_from(connectome$connection), vregion_to(connectome$connection))
rois_t <- tibble(roi = rois)
rois_t %>%
  group_by(roi) %>%
  summarize(N = length(roi)) %>%
  arrange(N) -> rois_dist

# Render as a styled HTML table.
rois_dist %>%
  xtable() %>%
  kable(digits = 5) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
roi N
4 1
22 1
23 1
24 1
25 1
28 1
29 1
31 1
36 1
39 1
45 1
50 1
57 1
58 1
60 1
62 1
63 1
66 1
68 1
83 1
84 1
86 1
90 1
91 1
100 1
101 1
103 1
107 1
108 1
118 1
124 1
126 1
132 1
133 1
134 1
135 1
137 1
138 1
143 1
144 1
149 1
151 1
152 1
154 1
157 1
158 1
163 1
164 1
165 1
172 1
173 1
175 1
180 1
181 1
185 1
192 1
194 1
199 1
204 1
205 1
206 1
208 1
212 1
215 1
216 1
217 1
218 1
219 1
232 1
233 1
234 1
238 1
242 1
243 1
253 1
254 1
258 1
261 1
263 1
264 1
32 2
65 2
69 2
75 2
80 2
94 2
95 2
120 2
125 2
155 2
156 2
159 2
169 2
174 2
190 2
203 2
209 2
211 2
252 2
255 2
257 2
70 3
93 3
162 3
183 3
186 4

And visualize

# Histogram of connections-per-ROI, annotated with the count in each bin.
ggplot(rois_dist, aes(x=N)) +
  geom_histogram(alpha=0.5, col="white", 
                 binwidth = 1) +
  ggtitle("Distribution of Number of Connections per ROI") +
  stat_bin(binwidth= 1, 
           geom="text", 
           aes(label = paste("N =", ..count..)), 
           vjust = -1) +
  ylim(0, 100) +
  theme_pander()

Cross Validated Predictions

How well is the model doing? To investigate this, we can hand-craft a Leave-One-Out regression model and save the predicted strategy values as well as the recorded beta weights.

# Hand-rolled leave-one-out validation: refit an OLS model (restricted to
# the connections lasso selected) N times, each time predicting the
# held-out subject. NOTE(review): lm() on a 0/1 outcome is a linear
# probability model, so predictions are not bounded to [0, 1].
dfX <- data.frame(cbind(Y, X[,betas != 0]))
numP <- ncol(X[, betas != 0])  
numO <- length(Y)
names(dfX) <- c("Y", paste("X", 1:numP, sep=""))

Yp <- rep(0, numO)  # Vector of zeros the size of Y 
Xe <- matrix(rep(0, numP * numO), 
             ncol = numP)  # Matrix of zeros the dimensions of X

for (i in seq(1, length(Y))) {
  subdfX <- dfX[-i,]  # training set: everyone but subject i
  lmod<-lm(Y ~ . + 1, as.data.frame(subdfX))
  
  # Predict the held-out subject; keep this fold's slope coefficients.
  yp <- predict(object=lmod, 
                newdata=dfX[i, 2:(numP + 1)])
  Yp[i] <- yp
  Xe[i,] <- lmod$coefficients[2:(numP + 1)]
}

Predicted vs. Observed

Now, let’s do a real predicted vs. observed graph:

# Observed vs. cross-validated predictions; discrete class via a 0.5 cutoff.
wcomparison <- tibble(Observed = Y,
                      Predicted = Yp,
                      DiscretePredicted = ifelse(Yp < 0.5, 0, 1))
              
wcomparison %<>% mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                                          "Correct", 
                                          "Misclassified"))

# Correlation truncated (floor, not rounded) to two decimals for the label.
rval <- floor(100*cor(Y, Yp))/100

# Scatter of predicted vs. observed strategy (vertically jittered), with
# the identity line and the correlation annotation.
p <- ggplot(wcomparison, aes(x=Predicted, y=Observed, 
                             col=Accuracy)) +
  geom_point(size=4, alpha=0.6, 
             position= position_jitter(height = 0.02)) +
  geom_abline(intercept = 0, slope = 1, 
              col="red",
              linetype="dashed") +
  scale_color_d3() +
  theme_pander() +

  theme(legend.position = "right") +
  guides(col=guide_legend("Classification")) +
  coord_fixed(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate("text", x=0.3, y=0.7,
           label=paste("r(",
                       length(Y),
                       ") = ",
                       rval,
                       ", p < 0.001",
                       sep="")) +
  ylab("Observed Strategy") +
  xlab("Predicted Strategy") +
  ggtitle("Decision Strategy:\nCross-Validation") +
  theme(legend.position = "bottom")
  
# Add marginal density plots on both axes.
ggMarginal(p, 
           fill="grey", 
           alpha=0.75,
           type="density", #bins=13, 
           col="darkgrey",
           margins = "both")

ROC

And now, ROC to get the accuracy.

# ROC computed from the 0.5-thresholded (binary) predictions.
wcomparison %<>% mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))

rocobj <- roc(wcomparison$Observed, wcomparison$ROCPrediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# NOTE(review): the x-axis label "(FPR)" looks wrong — specificity is the
# true negative rate (1 - FPR); confirm the intended labels.
g <- ggroc(rocobj, col="red") +
  geom_point(aes(y=rocobj$sensitivities, x=rocobj$specificities), col="red", size=4, alpha=.5) +
  ggtitle("ROC Curve for Model") +
  xlab("Specificity (FPR)") + ylab("Sensitivity (TPR)") + 
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed") +
  theme_pander()

g

The final accuracy of the prediction model is 0.9290805

ROC Curve By Sliding Threshold

To assess the robustness of our threshold, we can plot a ROC curve with specifities and sensitivities for sliding values of the prediction threshold.

# Sweep the decision threshold over [0, 1]; at each threshold compute the
# per-class accuracy: TNR for Observed == 0 and TPR for Observed == 1.
curve <- NULL

for (threshold in seq(0, 1, 0.01)) {
  subthreshold <- wcomparison %>%
    mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%  # clamp above 1
    mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%  # clamp to a tiny positive so "<= 0" thresholding stays well defined
    mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
    mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
    group_by(Observed) %>%
    summarise(Accuracy = mean(Accuracy))
  
  tnr <- subthreshold %>% 
    filter(Observed == 0) %>% 
    dplyr::select(Accuracy) %>%
    as.numeric()
  
  tpr <- subthreshold %>% 
    filter(Observed == 1) %>% 
    dplyr::select(Accuracy) %>%
    as.numeric()
  
  # Accumulate one row per threshold.
  partial <- tibble(Threshold = threshold,
                    TNR = tnr,
                    TPR = tpr)
  if (is.null(curve)) {
    curve <- partial
  } else {
    curve <- rbind(curve, partial)
  }
}

And now, we can visualize the discrete ROC:

# Discrete ROC from the threshold sweep; x axis reversed so the ideal
# classifier sits at the top-left corner.
ggplot(arrange(curve, TPR), aes(x=TNR, y=TPR)) + 
  geom_point(size=2, col="red", alpha=0.5) + 
  geom_line(col="red") + 
  ylab("Sensitivity (True Positive Rate)") +
  xlab("Specificity (True Negative Rate)") +
  scale_x_reverse() +
  # ylim(0, 1) +
  # xlim(1, 0) +
  ggtitle("ROC Curve for Different Thresholds") +
  geom_abline(slope=1, intercept = 1, col="grey", linetype = "dashed") +
  theme_pander()

Stability of Estimated Beta Weights

And now, let’s visualize the beta weights of the connections

# Label the columns of the per-fold beta matrix and reshape to long format
# (one row per connection x CV fold) for plotting.
# Fix: the original call was paste("Connection", 1:nrow(connectome), paste="");
# the misspelled `paste=` argument (instead of `sep=`) was absorbed by `...`
# and pasted an empty trailing element, producing labels like
# "Connection 1 ". paste0() yields the intended compact labels.
colnames(Xe) <- paste0("Connection", seq_len(nrow(connectome)))
wconnections <- as_tibble(Xe)
lconnections <- pivot_longer(wconnections, cols=colnames(Xe), 
                             names_to="Connection", values_to = "Beta")

# Sort connections by beta so the axis labels below follow the same order.
connectome <- connectome %>% arrange(Beta)

# Per-fold beta estimates for each selected connection (jittered points),
# with mean markers and bootstrap CIs.
# NOTE(review): scale_x_discrete relabels the reorder(Connection, Beta)
# axis with the beta-sorted connectome rows — confirm the two orderings
# stay aligned.
ggplot(lconnections, aes(x = reorder(Connection, Beta), y = Beta)) +
  geom_point(aes(col=Beta), alpha=.5, 
             size=2,
             position = position_jitter(height = 0, width = 0.3)) +
  stat_summary(fun.data = "mean_sdl", geom="point", fill="black", alpha=1, size=1) +
  scale_color_gradient2(low = "dodgerblue",
                        mid = "wheat",
                        high = "red2",
                        midpoint = 0) +
  scale_x_discrete(labels = 
                     paste(connectome$network_names, 
                           " (", 
                           connectome$connection,
                           ")", sep="")) +
  ggtitle("Connection Weights\nAcross Cross-Validation") +
  ylab(expression(paste(beta, " value"))) +
  xlab("Connection") +
  geom_hline(yintercept = 0, col="grey") +
  stat_summary(fun.data = "mean_cl_boot", 
               col="black", geom="errorbar", width=1) +
  #scale_color_viridis(option="plasma", begin=0.2, end=0.9) +
  
  theme_pander() +
  theme(axis.text.y = element_text(angle=0, hjust=1),
        legend.position = "NA") +
  #ylim(-3, 3) +
  coord_flip()

Testing the validity of the Lasso model

Here, we will examine the quality of our Lasso model by doing a series of tests.

Ablation test

In the ablation test, we remove all the connections with significant beta values, and check whether the results are still significant.

# Ablation: remove every connection lasso selected (Beta != 0) and refit
# on the remaining columns, reusing the original lambda path so the two
# CV curves are comparable.
XX <- X[, conn_betas$Beta == 0]

fit_wo <- glmnet(y = Y,
                 x = XX,
                 alpha=1,
                 lambda = fit$lambda,
                 family = "binomial",
                 type.measure = "class",
                 weights = W,
                 standardize = T
)

# LOO cross-validation of the ablated model over the same lambda path.
fit_wo.cv <- cv.glmnet(y = Y,
                       x = XX,
                       alpha=1,
                       weights = W,
                       lambda = fit$lambda,
                       standardize=T,
                       type.measure = "class",
                       family = "binomial",
                       grouped=F,
                       nfolds=length(Y)
)

The model does converge, but its overall classification error is much higher.

# Coefficient paths of the ablated model; dashed line at the L1 norm of its
# coefficients at lambda.1se.
plot(fit_wo, sub="Beta Values for Connectivity")
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
L1norm <- sum(abs(fit_wo$beta[,which(fit_wo$lambda==fit_wo.cv$lambda.1se)]))
abline(v=L1norm, lwd=2, lty=2)

It is useful to plot the two \(\lambda\)-curves (with and without the relevant connections) on the same plot.

# Overlay the CV error curves of the full and ablated models on one plot,
# with a dashed line at the full model's lambda.1se.
lasso_df_wo <- tibble(lambda=fit_wo.cv$lambda, 
                   error=fit_wo.cv$cvm, 
                   sd=fit_wo.cv$cvsd)



lasso_df$Model <- "Full Model"
lasso_df_wo$Model <- "Without the Selected Connections"

lasso_uber <- rbind(lasso_df, lasso_df_wo)

ggplot(lasso_uber, aes(x = lambda, y = error, fill=Model)) +
  scale_color_d3() +
  scale_fill_d3()+
  geom_ribbon(aes(ymin = error - sd, 
                  ymax = error + sd), 
              alpha = 0.5,
              #fill="blue"
              ) +
  geom_line(aes(col=Model), lwd=2) +
  xlab(expression(lambda)) +
  ylab("Cross-Validation Error") +
  ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
  geom_vline(xintercept = fit.cv$lambda.1se,
             linetype="dashed") +
  theme_pander() +
  theme(legend.position="bottom")

Variance Inflation Factor

Then, we examine the Variance Inflation Factor (VIF). To calculate the VIF, we need to first create a linear model of the factor effects:

# OLS model on the lasso-selected connections, used only to compute VIFs.
dfX <- data.frame(cbind(Y, X[,betas != 0]))
mod<-lm(Y ~ . + 1, as.data.frame(dfX))

We can now calculate the VIF and turn the results into a tibble:

# Variance inflation factor of each selected predictor, as a tibble.
vifs <- vif(mod)
vifsT <- tibble(VIF = vifs)

And, finally, we can plot a histogram of the distribution of VIF values. VIF values < 10 are considered non-collinear; VIF values < 5 are great. All of our factors have VIF values that are much smaller than 5, which implies that they are as close to an orthogonal basis set as possible.

# Histogram of VIF values (values below 5 indicate low collinearity).
ggplot(vifsT, aes( x =VIF)) +
  geom_histogram(col="white", binwidth = 0.1, fill="blue", alpha=0.4) +
  theme_pander() +
  xlab("VIF Value") +
  ylab("Number of Predictors") +
  ggtitle("Distribution of Variance Inflation Factors")

Nested Cross-Validation

Standard LOOV cross-validation suffers from data leakage: since the same data is used for both validation (hyper-parameter fitting — in this case, the \(\lambda\) parameter) and testing, success on testing is biased by the fact that validation was performed on the same sample.

The canonical way to handle this problem is to have separate train-validate-test groups. This, however, runs into the problem of dividing the data properly and having a reduced sample.

An alternative solution is to do nested cross-validation. In this case, the data is analyzed with two LOOV loops: at every turn, one subject is left out as a test subject, and the remaining N-1 are used for training and validation. This group is then again split, leaving one participant out for validation and the remaining N-2 for training. The inner loop is repeated until the validation is complete, at which point we use the hyper-parameters that guarantee the best generalization to test the model on the original, left-out participant. The process is repeated for every participant.

Note that a different model (and a different profile of \(\lambda\)) is generated for every participant. To obtain the best model, we first select the \(\lambda_{1SE}\) value. If there are no regressors left (i.e., \(N(\beta \neq 0) = 0\)), we select \(\lambda_{min}\). If we still have no regressors, we select the largest value of \(\lambda\) for which at least one regressor is left \(\neq 0\).

N <- length(Y)
P <- ncol(X)
# Per-fold selected coefficients (this `betas` matrix replaces the earlier
# `betas` vector) and the nested-CV predictions.
betas <- matrix(rep(0, P * N), nrow = N)
Yp <- rep(0, N)
minF = 1  # minimum number of non-zero features required in each fold
# Fisher z-transform of the connectivity values.
# NOTE(review): this overwrites X for everything downstream — confirm.
X <- atanh(X)
#dfX <- as.data.frame(cbind(Y, X))
# Outer LOO loop: subject i is the test case; cv.glmnet on the remaining
# N-1 subjects is the inner (nested) LOO validation that picks lambda.
for (i in 1:N) {
  Ytrain <- Y[-i]
  Xtrain <- X[-i,]
  Wtrain <- W[-i]
  # fit <- ncvreg
  fit <- glmnet(y = Ytrain,
                x = Xtrain,
                weights = Wtrain,
                alpha = 1,
                family = "binomial",
                type.measure ="class",
                standardize = T
  )
  
  fit.cv <- cv.glmnet(y = Ytrain,
                      x = Xtrain,
                      alpha=1,
                      weights=Wtrain,
                      #penalty="SCAD",
                      family = "binomial",
                      type.measure = "class",
                      standardize=T,
                      grouped=F,
                      #nfolds=5
                      nfolds=length(Ytrain)
  )
  
  # Lambda selection: prefer lambda.1se; fall back to lambda.min when 1se
  # keeps no features; if even lambda.min keeps none, take the largest
  # lambda that retains at least minF features.
  lambda <- fit.cv$lambda.min
  nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.min)]
  
  if (fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.1se)] > 0) {
    lambda <- fit.cv$lambda.1se
    nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.1se)]
  }
  
  if (nzero < minF) {
    # If no features, select a less-generalizable lambda
    lambda <- fit.cv$lambda[which(fit.cv$nzero >= minF)[1]]
  } 
  
  B <- fit$beta[,which(fit$lambda==lambda)]
  #B <- fit$beta[,which(fit$lambda==fit$lambda[60])]
  #print(B)
  #print(fit.cv$lambda.min)
  #plot(fit.cv)
  if (length(B[B!=0])) {
    #print(c(i, length(B[B!=0])))
    # Refit an OLS model on the selected columns (training rows only).
    dfX <- data.frame(cbind(Y, X[, B != 0]))
    lmod<-lm(Y ~ . + 1, data=dfX[-i,])
    #print(lmod)
    #Xtest <- dfX[i,-1]
    # NOTE(review): the prediction combines the OLS intercept with the
    # *lasso* betas (B is zero outside the selected columns) — confirm
    # this hybrid is intended rather than predict(lmod, ...).
    yp <- lmod$coefficients[1] + sum(B*X[i,])
  } else {
    
    # No features survived: fall back to the training-set base rate.
    yp <- mean(Ytrain)
  }
  betas[i,] <- B
  Yp[i] <- yp
}

One of the problems of nested CV is that a different model is tested for each participant. To get a sense of the most predictive regressors, we can simply average across them:

# Average the per-fold coefficient vectors to summarize which connections
# were most predictive across the N nested-CV models, then save the
# surviving (nonzero) connections to disk.
uB <- colMeans(betas)
conn_betas <- tibble(index = I$index, Beta = uB)
connectome_nested <- order %>%
  filter(index %in% I$index) %>%
  inner_join(conn_betas) %>%
  filter(Beta != 0) %>%
  dplyr::select(-censor2) %>%
  arrange(index)
## Joining, by = "index"
write_csv(connectome_nested, file = "strategy_mr_nestd.csv")

Predicted vs. Observed

Now, let’s do a real predicted vs. observed graph:

# Assemble observed and predicted values, a 0/1 discretized prediction
# (threshold 0.5), and a per-subject classification label.
wcomparison <- tibble(Observed = Y,
                      Predicted = Yp) %>%
  mutate(DiscretePredicted = ifelse(Yp < 0.5, 0, 1),
         Accuracy = ifelse(DiscretePredicted == Observed,
                           "Correct",
                           "Misclassified"))

# Observed/predicted correlation, truncated (not rounded) to two decimals
# for display in the plot annotation.
rval <- floor(100 * cor(Y, Yp)) / 100

# Predicted vs. observed scatter plot.  Points are vertically jittered
# (Observed is binary) and colored by classification outcome; the dashed
# red identity line marks perfect prediction.
# Fix: the original set theme(legend.position = "right") and then
# immediately overrode it with "bottom"; only the single effective
# setting is kept here.
p <- ggplot(wcomparison, aes(x = Predicted, y = Observed,
                             col = Accuracy)) +
  geom_point(size = 4, alpha = 0.6,
             position = position_jitter(height = 0.02)) +
  geom_abline(intercept = 0, slope = 1,
              col = "red",
              linetype = "dashed") +
  scale_color_d3() +
  theme_pander() +
  guides(col = guide_legend("Classification")) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  # Annotate with the truncated correlation.  NOTE(review): the
  # "p < 0.001" label is hard-coded -- confirm it matches the actual test.
  annotate("text", x = 0.3, y = 0.7,
           label = paste0("r(",
                          length(Y),
                          ") = ",
                          rval,
                          ", p < 0.001")) +
  ylab("Observed Strategy") +
  xlab("Predicted Strategy") +
  ggtitle("Decision Strategy:\nCross-Validation") +
  theme(legend.position = "bottom")

# Add marginal density plots on both axes.
ggMarginal(p,
           fill = "grey",
           alpha = 0.75,
           type = "density",
           col = "darkgrey",
           margins = "both")

ROC

And now, ROC to get the accuracy.

# Binarize the continuous predictions at 0.5 and compute a ROC curve from
# the discretized labels.
# NOTE(review): because predictions are thresholded before roc() is
# called, the curve has a single interior operating point; passing the
# continuous `Predicted` values would yield a full ROC curve -- confirm
# this is intended.
wcomparison %<>% mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))

rocobj <- roc(wcomparison$Observed, wcomparison$ROCPrediction)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC with the chance diagonal as a dashed grey segment.
# NOTE(review): the x-axis label pairs "Specificity" with "(FPR)";
# specificity is the true negative rate -- check the intended label.
g <- ggroc(rocobj, col="red") +
  geom_point(aes(y=rocobj$sensitivities, x=rocobj$specificities), col="red", size=4, alpha=.5) +
  ggtitle("ROC Curve for Model") +
  xlab("Specificity (FPR)") + ylab("Sensitivity (TPR)") + 
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed") +
  theme_pander()

g

The final accuracy of the prediction model is 0.8312188

ROC Curve By Sliding Threshold

To assess the robustness of our threshold, we can plot a ROC curve with specificities and sensitivities for sliding values of the prediction threshold.

# Sweep the classification threshold from 0 to 1 in steps of 0.01 and
# record the true negative rate (specificity) and true positive rate
# (sensitivity) at each step.
# Fixes: results are collected in a preallocated list and bound once at
# the end (growing `curve` with rbind() inside the loop is quadratic),
# and scalar extraction uses pull() instead of select() + as.numeric().
thresholds <- seq(0, 1, 0.01)
partials <- vector("list", length(thresholds))

for (k in seq_along(thresholds)) {
  threshold <- thresholds[k]
  subthreshold <- wcomparison %>%
    # Clamp predictions into (0, 1]: values > 1 become 1; values <= 0 are
    # replaced by a tiny positive constant so that they still classify as
    # 0 even when the threshold is exactly 0.
    mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%
    mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%
    mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
    mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
    group_by(Observed) %>%
    summarise(Accuracy = mean(Accuracy))

  # Class-conditional accuracies: Observed == 0 gives the TNR,
  # Observed == 1 gives the TPR.
  tnr <- subthreshold %>%
    filter(Observed == 0) %>%
    pull(Accuracy)

  tpr <- subthreshold %>%
    filter(Observed == 1) %>%
    pull(Accuracy)

  partials[[k]] <- tibble(Threshold = threshold,
                          TNR = tnr,
                          TPR = tpr)
}

curve <- bind_rows(partials)

And now, we can visualize the discrete ROC:

# Discrete ROC across the threshold sweep: sensitivity vs. (reversed)
# specificity, with the chance diagonal as a dashed grey line.
curve %>%
  arrange(TPR) %>%
  ggplot(aes(x = TNR, y = TPR)) +
  geom_point(size = 2, col = "red", alpha = 0.5) +
  geom_line(col = "red") +
  scale_x_reverse() +
  labs(x = "Specificity (True Negative Rate)",
       y = "Sensitivity (True Positive Rate)",
       title = "ROC Curve for Different Thresholds") +
  geom_abline(slope = 1, intercept = 1, col = "grey", linetype = "dashed") +
  theme_pander()